# Core stack: pandas for tabular data, TextBlob for sentiment/translation,
# spaCy for NLP, matplotlib/seaborn for plotting, numpy for numerics.
import pandas as pd
from textblob import TextBlob
import nltk
from spacy.lang.en import STOP_WORDS
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Medium English model — includes word vectors, needed later for .similarity().
nlp=spacy.load("en_core_web_md")
# English stopword set from spaCy (NOTE: the global STOP_WORDS is rebound to
# the Indonesian set further down the notebook).
english_stopwords=STOP_WORDS
# ---- Load the dataset and take a first look ----
data = pd.read_csv("assignment.csv")

# Preview the first rows and the dtype / non-null summary.
data.head(10)
data.info()

# Cardinality of every column.
for col in data.columns:
    print('Total unique in', col, '=', data[col].nunique())

# Missing values per column, plus a quick bar chart of the same counts.
for col in data.columns:
    print('Total null values in', col, '=', data[col].isnull().sum())
data.isnull().sum().plot(kind='bar')
The review_text column has 75 null values.
Converting all the text data into lowercase
# data['raw_text']=data['raw_text'].str.lower()
# data['review_text']=data['review_text'].str.lower()
As we have seen, there are two text columns, raw_text and review_text, and they are in different languages. The raw_text column is in English, but review_text is in some other language that we do not know yet, so we first have to identify that language.
# Language detection helper. TextBlob delegates detection to the Google
# Translate API, so this needs network access (NOTE(review): the
# detect_language method was removed in newer TextBlob releases — confirm
# the installed version supports it).
def detect_lang(text):
    """Return the detected language code for the stringified *text*."""
    blob = TextBlob(str(text))
    return blob.detect_language()

print("detected language of raw_text is: ", detect_lang(data['raw_text']))
print("\n detected language of review_text is: ", detect_lang(data['review_text']))
From the output above we learn that the other language is Indonesian. Next we check whether the information in raw_text and review_text is the same; to do so we translate the first few Indonesian sentences to English.
# Translation helper. TextBlob's translate() calls the Google Translate API
# (needs network access; removed in newer TextBlob releases).
def convert(text):
    """Translate the stringified *text* to English; returns a TextBlob."""
    blob = TextBlob(str(text))
    return blob.translate(to='en')

# Bug fix: the labels say "top 5" but the slices originally took 10 rows —
# use [:5] so the printed output matches the label.
print("top 5 sentences of raw_text is: \n ", data['raw_text'][:5])
print("\n top 5 sentences of review_text is:\n ", convert(data['review_text'][:5]))

# Keep each language in its own single-column frame for later processing.
data_english = pd.DataFrame(data['raw_text'])
data_indonesia = pd.DataFrame(data['review_text'])
Finding similarity between the two documents using Spacy
# Document-level similarity between the two columns using spaCy vectors.
doc_eng = nlp(str(data_english))
doc_indo = nlp(str(data_indonesia))
a = doc_eng.similarity(doc_indo)
print('% of similar document: ', np.round(a * 100, decimals=2))

# Pie chart of the similar vs. not-similar share.
# Bug fix: the slice sizes were hard-coded as [0.27, 0.74] (which sums to
# 1.01); derive them from the computed similarity instead.
labels = ['Similarity', 'Not Similar']
sizes = [a, 1 - a]
explode = (0.1, 0)  # pull the first slice out slightly
plt.pie(sizes, explode=explode, labels=labels, colors=plt.cm.Set2.colors,
        autopct='%2.2f%%', shadow=True, startangle=140)
plt.axis('equal')
plt.show()
From the chart we can see that only about 26.73% of the text is similar.
First dealing with the English data
import string

def feature(data):
    """Add simple numeric text features (from `raw_text`) to *data* in place.

    Adds word/char counts, word density, punctuation/symbol/digit counts,
    stopword and all-caps-token counts, uniqueness ratios, and TextBlob
    sentiment polarity.  Returns the mutated frame for convenience.
    """
    # Word / character counts and word density.
    data['word_count'] = data['raw_text'].apply(lambda x: len(x.split()))
    data['char_count'] = data['raw_text'].apply(lambda x: len(x.replace(" ", "")))
    data['word_density'] = data['word_count'] / (data['char_count'] + 1)
    # Whitespace-delimited tokens that are exactly one punctuation mark.
    data['punctuation'] = data['raw_text'].apply(
        lambda x: len([t for t in x.split() if t in string.punctuation]))
    data['total_length'] = data['raw_text'].apply(len)
    # Digit characters per sentence.
    data['numerics'] = data['raw_text'].apply(
        lambda x: len([c for c in x if c.isdigit()]))
    # Stopwords per sentence (spaCy STOP_WORDS set).
    data['stopwords'] = data['raw_text'].apply(
        lambda x: len([w for w in x.lower().split() if w in STOP_WORDS]))
    # Fully-uppercase tokens, and their share of the sentence length.
    data['capitals'] = data['raw_text'].apply(
        lambda x: len([w for w in x.split(" ") if w.isupper()]))
    # Robustness fix: guard against ZeroDivisionError on empty strings.
    data['caps_vs_length'] = data.apply(
        lambda row: float(row['capitals']) / float(row['total_length'])
        if row['total_length'] else 0.0, axis=1)
    data['num_exclamation_marks'] = data['raw_text'].apply(lambda x: x.count('!'))
    data['num_question_marks'] = data['raw_text'].apply(lambda x: x.count('?'))
    data['num_punctuation'] = data['raw_text'].apply(
        lambda x: sum(x.count(w) for w in '.,;:'))
    data['num_symbols'] = data['raw_text'].apply(
        lambda x: sum(x.count(w) for w in '*&#$%'))
    data['num_unique_words'] = data['raw_text'].apply(lambda x: len(set(x.split())))
    # Uniqueness ratio, plus the same figure expressed as a percentage.
    data['words_vs_unique'] = data['num_unique_words'] / data['word_count']
    data["word_unique_percent"] = data["num_unique_words"] * 100 / data['word_count']
    # TextBlob sentiment polarity in [-1, 1] (English-trained).
    data['polarity'] = data['raw_text'].apply(
        lambda text: TextBlob(text).sentiment.polarity)
    return data

feature(data_english)
data_english.describe()
import holoviews as hv
import hvplot.dask
hv.extension('bokeh')
# Fix: np.histogram's `normed` argument was removed in NumPy 1.24 — use
# `density` instead (False keeps raw counts, same as before).
hist=hv.Histogram(np.histogram(data_english['word_count'],bins=100, density=False),label='Histogram of Word Count')
hist.opts(color='red',width=600, height=380, logy=False, tools=['hover'],xlabel=' Word Count')
From the graph above we can see that most sentences have between 5 and 25 words.
# Fix: `normed` was removed from np.histogram (NumPy 1.24) — use `density`.
hist=hv.Histogram(np.histogram(data_english['char_count'],bins=100, density=False),label='Histogram of Charater Count')
hist.opts(color='green',width=600,height=380, logy=False, tools=['hover'],xlabel=' Charater Count')
From the graph above we can see that most sentences have between 17 and 110 characters.
# Fix: `normed` was removed from np.histogram (NumPy 1.24) — use `density`.
hist=hv.Histogram(np.histogram(data_english['word_density'],bins=100, density=False),label='Histogram of Word Density')
hist.opts(color='blue',width=600,height=380, logy=False, tools=['hover'],xlabel=' Word Density')

hist=hv.Histogram(np.histogram(data_english['total_length'],bins=100, density=False),label='Histogram of Total Length')
hist.opts(color='maroon',width=600,height=380, logy=False, tools=['hover'],xlabel='Total Length')
From the graph above we can see that most sentences have a length between 21 and 130 characters.
# Bar chart: how many sentences contain each count of numeric characters.
numeric_counts = data_english.numerics.value_counts()
bar = hv.Bars(numeric_counts, label='Numeric Value Count')
bar.opts(width=600, height=380, logy=False, tools=['hover'], xlabel='Numeric Value', ylabel='Sentences Count')
Most of the sentences have 0 numeric value, 2453 sentences has 1 numeric value and so on
# Fix: `normed` was removed from np.histogram (NumPy 1.24) — use `density`.
hist=hv.Histogram(np.histogram(data_english['capitals'],bins=100, density=False),label='Histogram of Capital Charater Count')
hist.opts(color='indigo',width=600,height=380, logy=False, tools=['hover'],xlabel=' Capital Charater Count')

# Exclamation-mark counts per sentence.
a=data_english.num_exclamation_marks.value_counts()
bar=hv.Bars(a,label='Number Exclamation Marks')
bar.opts(color='orange',width=600,height=380, logy=False, tools=['hover'],xlabel='Exclamation Mark count', ylabel='Sentences Count')

hist=hv.Histogram(np.histogram(data_english['num_unique_words'],bins=100, density=False),label='Histogram of Unique Words')
hist.opts(color='violet',width=600,height=380, logy=False, tools=['hover'],xlabel='Unique Words Count')
Most sentences contain between 5 and 20 unique words.
# Fix: `normed` was removed from np.histogram (NumPy 1.24) — use `density`.
hist=hv.Histogram(np.histogram(data_english['polarity'],bins=100, density=False),label='Histogram of Sentiment Polarity')
hist.opts(color='indigo',width=600,height=380, logy=False, tools=['hover'],xlabel='Sentiment Polarity')
Vast majority of the sentiment polarity scores are greater than zero, means most of them are pretty positive.
5 random reviews with the highest positive sentiment polarity
# Show a few of the most positive reviews (polarity == 1).
print('5 random reviews with the highest positive sentiment polarity: \n')
sampled = data_english.loc[data_english.polarity == 1, ['raw_text']].sample(5).values
for row in sampled:
    print(row[0])
5 random reviews with the most neutral sentiment(zero) polarity
# Show a few perfectly-neutral reviews (polarity == 0).
print('5 random reviews with the most neutral sentiment(zero) polarity: \n')
sampled = data_english.loc[data_english.polarity == 0, ['raw_text']].sample(5).values
for row in sampled:
    print(row[0])
random reviews with the most negative polarity
print('5 reviews with the most negative polarity: \n')
# Robustness fix: there may be fewer than 5 reviews with polarity == -1, and
# DataFrame.sample raises if asked for more rows than exist.
neg = data_english.loc[data_english.polarity == -1, ['raw_text']]
for row in neg.sample(min(5, len(neg))).values:
    print(row[0])
Before reviewing Text Features we need to first clean the text data, remove punctuations, stopwords, lemmatize the text data
# ---- Text cleanup: punctuation removal ----
import string

# Characters to strip: ASCII punctuation plus curly quotes, newline and a few
# extra characters observed in the reviews.
all_punctuations = string.punctuation + '‘’\n,""'':”][]!'

def remove_puctuation(text):
    """Return *text* with every character in `all_punctuations` removed.

    Fix: the original also ran the text through the spaCy pipeline and
    discarded the result — expensive dead work; removed.
    """
    kept = [char for char in text if char not in all_punctuations]
    return ''.join(kept)
def remove_stop_words(text):
    """Drop whitespace-delimited tokens found in the English stopword set.

    Fix: removed an unused `nlp(str(text))` parse — expensive dead work.
    """
    kept = [word for word in text.split() if word not in english_stopwords]
    return " ".join(kept)
# Lemmatize: map every token to its lowercased base form, e.g. literals ->
# literal, best -> good. spaCy marks pronouns with the sentinel lemma
# "-PRON-", for which the original lowercased token is kept instead.
def lemmatize(text):
    doc = nlp(str(text))
    lemmas = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            lemmas.append(token.lemma_.lower().strip())
        else:
            lemmas.append(token.lower_)
    return " ".join(lemmas)
all_punctuations

# Strip punctuation from the raw English text (the original comment said
# "Removing StopWords", but this step removes punctuation).
data_english['processed_raw_text'] = data_english['raw_text'].apply(remove_puctuation)

# Quick interactive histogram of sentiment polarity.
import plotly.graph_objects as go
import chart_studio.plotly as py
fig = go.Figure(go.Histogram(x=data_english.polarity, marker_color='#EB89B5'))
fig.show()

# Lemmatize in place, then drop stopwords into a separate column.
data_english['processed_raw_text'] = data_english['processed_raw_text'].apply(lemmatize)
data_english['processed_wo_stopwords_raw_text'] = data_english['processed_raw_text'].apply(remove_stop_words)
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image

# Word cloud over the processed English text. NOTE(review): str() of a
# Series renders the truncated pandas preview, not the full column —
# presumably intended; verify.
text = str(data_english['processed_raw_text'])
wordcloud = WordCloud(max_font_size=100, max_words=1000, background_color="white",
                      width=1200, height=650, colormap="icefire").generate(text)
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
From the word cloud above we can see that PHONE, BOOK, ONE, READ and UPDATE have high frequency, i.e. they occur most often.
we need to extract N-Gram features. N-grams are used to describe the number of words used as observation points, e.g., unigram means singly-worded, bigram means 2-worded phrase, and trigram means 3-worded phrase. In order to do this, we use scikit-learn’s CountVectorizer function.First, it would be interesting to compare unigrams before and after removing stop words.
The distribution of top unigrams before removing stop words
from sklearn.feature_extraction.text import CountVectorizer
vector=CountVectorizer()

def top_words(text):
    """Return (word, count) pairs for *text*, sorted by count descending.

    Fix: the original ignored its `text` argument and always vectorized
    data_english['processed_raw_text']; using the argument makes the helper
    reusable.  The call below passes that same column, so output is unchanged.
    """
    vec = CountVectorizer()
    bag_of_words = vec.fit_transform(text)
    sum_of_words = bag_of_words.sum(axis=0)
    frequency = [(word, sum_of_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    return sorted(frequency, key=lambda x: x[1], reverse=True)

# Top-20 unigram frequencies before stopword removal.
words = top_words(data_english['processed_raw_text'])
df = pd.DataFrame(words, columns=['ReviewText', 'count'])
df = df.head(20)
fig = go.Figure([go.Bar(x=df['ReviewText'], y=df['count'], text=df['count'], textposition='auto', textangle=0)])
fig.update_layout(title_text='Top 20 words in review before removing stop words',)
fig.update_traces(marker_color='green', marker_line_color='black', marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330, title='Words')
fig.update_yaxes(title='Count')
fig.show()
The distribution of top unigrams after removing stop words
def top_words_wo_stop(text):
    """Top unigrams of *text* with sklearn's built-in English stopwords removed.

    Fix: use the `text` parameter instead of the hard-coded global column
    (same call-site argument, so output is unchanged).
    """
    vec = CountVectorizer(stop_words='english')
    bag_of_words = vec.fit_transform(text)
    sum_of_words = bag_of_words.sum(axis=0)
    frequency = [(word, sum_of_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    return sorted(frequency, key=lambda x: x[1], reverse=True)

words = top_words_wo_stop(data_english['processed_raw_text'])
df1 = pd.DataFrame(words, columns=['ReviewText', 'count'])
df1 = df1.head(20)
fig = go.Figure([go.Bar(x=df1['ReviewText'], y=df1['count'], text=df1['count'], textposition='auto', textangle=0)])
fig.update_layout(title_text='Top 20 words in review after removing stop words',)
fig.update_traces(marker_color='blue', marker_line_color='black', marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330, title='Words')
fig.update_yaxes(title='Count')
fig.show()
The distribution of top bigrams before removing stop words
def top_words_wo_stop(text):
    """Top bigrams of *text*, stopwords kept (despite the reused name).

    Fix: use the `text` parameter instead of the hard-coded global column.
    """
    vec = CountVectorizer(ngram_range=(2, 2))
    bag_of_words = vec.fit_transform(text)
    sum_of_words = bag_of_words.sum(axis=0)
    frequency = [(word, sum_of_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    return sorted(frequency, key=lambda x: x[1], reverse=True)

words = top_words_wo_stop(data_english['processed_raw_text'])
df2 = pd.DataFrame(words, columns=['ReviewText', 'count'])
df2 = df2.head(20)
fig = go.Figure([go.Bar(x=df2['ReviewText'], y=df2['count'], text=df2['count'], textposition='auto', textangle=0)])
fig.update_layout(title_text='Top 20 bigrams in review before removing stop words',)
fig.update_traces(marker_color='black', marker_line_color='black', marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330, title='Words')
fig.update_yaxes(title='Count')
fig.show()
The distribution of top bigrams after removing stop words
def top_words_wo_stop(text):
    """Top bigrams of *text* with the spaCy English stopword set removed.

    Fixes: use the `text` parameter instead of the hard-coded global column,
    and pass the stopwords as a list — CountVectorizer documents `stop_words`
    as a string or list, not a set.
    """
    vec = CountVectorizer(stop_words=list(english_stopwords), ngram_range=(2, 2))
    bag_of_words = vec.fit_transform(text)
    sum_of_words = bag_of_words.sum(axis=0)
    frequency = [(word, sum_of_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    return sorted(frequency, key=lambda x: x[1], reverse=True)

words = top_words_wo_stop(data_english['processed_raw_text'])
df3 = pd.DataFrame(words, columns=['ReviewText', 'count'])
df3 = df3.head(20)
fig = go.Figure([go.Bar(x=df3['ReviewText'], y=df3['count'], text=df3['count'], textposition='auto', textangle=0)])
fig.update_layout(title_text='Top 20 bigrams in review after removing stop words',)
fig.update_traces(marker_color='maroon', marker_line_color='black', marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330, title='Words')
fig.update_yaxes(title='Count')
fig.show()
The distribution of Top trigrams before removing stop words
def top_words_wo_stop(text):
    """Top trigrams of *text*, stopwords kept.

    Fix: use the `text` parameter instead of the hard-coded global column.
    """
    vec = CountVectorizer(ngram_range=(3, 3))
    bag_of_words = vec.fit_transform(text)
    sum_of_words = bag_of_words.sum(axis=0)
    frequency = [(word, sum_of_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    return sorted(frequency, key=lambda x: x[1], reverse=True)

words = top_words_wo_stop(data_english['processed_raw_text'])
df4 = pd.DataFrame(words, columns=['ReviewText', 'count'])
df4 = df4.head(20)
fig = go.Figure([go.Bar(x=df4['ReviewText'], y=df4['count'], text=df4['count'], textposition='auto', textangle=0)])
# Bug fix: this is the BEFORE-stopword-removal plot; the title said "after".
fig.update_layout(title_text='Top 20 trigrams in review before removing stop words',)
fig.update_traces(marker_color='steelblue', marker_line_color='black', marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330, title='Words')
fig.update_yaxes(title='Count')
fig.show()
The distribution of Top trigrams after removing stop words
def top_words_wo_stop(text):
    """Top trigrams of *text* with the spaCy English stopword set removed.

    Fixes: use the `text` parameter instead of the hard-coded global column,
    and pass the stopwords as a list (CountVectorizer expects str or list).
    """
    vec = CountVectorizer(ngram_range=(3, 3), stop_words=list(english_stopwords))
    bag_of_words = vec.fit_transform(text)
    sum_of_words = bag_of_words.sum(axis=0)
    frequency = [(word, sum_of_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    return sorted(frequency, key=lambda x: x[1], reverse=True)

words = top_words_wo_stop(data_english['processed_raw_text'])
df5 = pd.DataFrame(words, columns=['ReviewText', 'count'])
df5 = df5.head(20)
fig = go.Figure([go.Bar(x=df5['ReviewText'], y=df5['count'], text=df5['count'], textposition='auto', textangle=0)])
fig.update_layout(title_text='Top 20 trigrams in review after removing stop words')
fig.update_traces(marker_color='rebeccapurple', marker_line_color='black', marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330, title='Words')
fig.update_yaxes(title='Count')
fig.show()
Converting polarity values into POSITIVE , NEUTRAL and NEGATIVE
# Map polarity to a categorical sentiment label:
#   positive :  0.2 <= polarity <= 1
#   neutral  :  0   <= polarity <  0.2
#   negative : -1   <= polarity <  0
pos_mask = (data_english.polarity >= 0.2) & (data_english.polarity <= 1)
neu_mask = (data_english.polarity >= 0) & (data_english.polarity < 0.2)
neg_mask = (data_english.polarity >= -1) & (data_english.polarity < 0)
data_english.loc[pos_mask, 'sentiment'] = 'positive'
data_english.loc[neu_mask, 'sentiment'] = 'neutral'
data_english.loc[neg_mask, 'sentiment'] = 'negative'

# Bar chart: number of reviews in each sentiment bucket.
y = pd.DataFrame(data_english.groupby('sentiment')['processed_raw_text'].count())
y.reset_index(inplace=True)
fig = go.Figure([go.Bar(x=y['sentiment'], y=y['processed_raw_text'], text=y['processed_raw_text'],
                        textposition='auto', textangle=0)])
fig.update_layout(title_text='Total number of Sentiment in each category')
fig.update_traces(marker_color='dodgerblue', marker_line_color='black', marker_line_width=1.5, opacity=0.8)
fig.update_xaxes(tickangle=330, title='Sentiment')
fig.update_yaxes(title='Count')
fig.show()
Creating Radar Plot
# Radar (polar) plot comparing median text statistics per sentiment class.
# Fixes: displayed-label typo 'Total_Lenght' -> 'Total_Length', title typo
# 'Comapring' -> 'Comparing', and the three near-identical trace literals
# are built by a helper instead of being copy-pasted.
_RADAR_THETA = ['Total_Length', 'WordCount',
                'Count_unique_words', "Capitals_VS_Length", "Charcount"]

def _radar_trace(sentiment, color, subplot, display_name):
    """Build one Scatterpolar trace of median statistics for *sentiment*."""
    grp = data_english.loc[data_english["sentiment"] == sentiment]
    return go.Scatterpolar(
        r=[grp['total_length'].median(),
           grp['word_count'].median(),
           grp['num_unique_words'].median(),
           grp['caps_vs_length'].median(),
           grp['char_count'].median()],
        theta=_RADAR_THETA,
        fill='toself',
        line=dict(color=color),
        name=display_name,
        subplot=subplot)

Data = [
    _radar_trace("positive", 'brown', "polar", "Positive Statistics"),
    _radar_trace("neutral", 'magenta', "polar2", "Neutral Statistics"),
    _radar_trace("negative", 'orange', "polar3", "Negative Statistics"),
]
layout = go.Layout(
    polar3=dict(domain=dict(x=[0, 0.6], y=[0.55, 1]),
                radialaxis=dict(visible=True)),
    polar2=dict(domain=dict(x=[0, 0.35], y=[0, 0.45]),
                radialaxis=dict(visible=True)),
    polar=dict(domain=dict(x=[0.33, 0.999999], y=[0, 0.45]),
               radialaxis=dict(visible=True)),
    title="Comparing Median")
fig = go.Figure(data=Data, layout=layout)
fig.show()
Finding NER using SPACY
# Named-entity visualization with spaCy's displacy.
from spacy import displacy

doc_eng = nlp(str(data_english['processed_raw_text']))

# Collapse runs of whitespace in each sentence before re-parsing.
import re
sample = []
for sent in doc_eng.sents:
    # Fix: raw string for the regex — "\s" in a plain string is an invalid
    # escape sequence (DeprecationWarning, error in future Python).
    cleaned = re.sub(r"\s+", " ", sent.text)
    print(cleaned, "\n")
    sample.append(cleaned)

doc2 = nlp(str(sample))
displacy.render(doc2, style='ent', jupyter=True)
# Build a token-level part-of-speech table from the cleaned sentences.
# Bug fix: the loop parsed nlp(str(sample)) — the WHOLE list — once per
# sentence, duplicating every token len(sample) times; parse each sentence.
text = []
pos = []
pos_tag = []
sentences = []
for sent in sample:
    parsed_sentence = nlp(str(sent))
    for token in parsed_sentence:
        text.append(token.text)
        pos.append(token.pos_)
        pos_tag.append(token.tag_)
        sentences.append(token)
POS = pd.DataFrame({'sentence': sentences, 'text': text, 'pos': pos, 'pos_tag': pos_tag})
POS.head(7)
Top 20 verbs in the text
# Top 20 verbs. (Fix: dropped a dead groupby whose result was immediately
# overwritten on the next line.)
a = POS[POS.pos == 'VERB']
a = a.groupby('text')['pos'].count().sort_values(ascending=False).head(20)
a = pd.DataFrame(a)
a.reset_index(inplace=True)
fig = go.Figure([go.Bar(x=a['text'], y=a['pos'], text=a['pos'], textposition='auto', textangle=0)])
fig.update_layout(title_text='Top 20 VERB in the text',)
fig.update_traces(marker_color='maroon', marker_line_color='black', marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330, title='Words')
fig.update_yaxes(title='Count')
fig.show()
# Top 20 nouns. (Fix: dropped a dead groupby whose result was immediately
# overwritten on the next line.)
b = POS[POS.pos == 'NOUN']
b = b.groupby('text')['pos'].count().sort_values(ascending=False).head(20)
b = pd.DataFrame(b)
b.reset_index(inplace=True)
fig = go.Figure([go.Bar(x=b['text'], y=b['pos'], text=b['pos'], textposition='auto', textangle=0)])
fig.update_layout(title_text='Top 20 NOUN in the text',)
fig.update_traces(marker_color='chocolate', marker_line_color='black', marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330, title='Words')
fig.update_yaxes(title='Count')
fig.show()
# Top 20 adjectives. (Fix: dropped a dead groupby whose result was
# immediately overwritten on the next line.)
c = POS[POS.pos == 'ADJ']
c = c.groupby('text')['pos'].count().sort_values(ascending=False).head(20)
c = pd.DataFrame(c)
c.reset_index(inplace=True)
fig = go.Figure([go.Bar(x=c['text'], y=c['pos'], text=c['pos'], textposition='auto', textangle=0)])
fig.update_layout(title_text='Top 20 ADJ in the text',)
fig.update_traces(marker_color='darkturquoise', marker_line_color='black', marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330, title='Words')
fig.update_yaxes(title='Count')
fig.show()
# Top 20 adverbs. (Fix: dropped a dead groupby whose result was immediately
# overwritten on the next line.)
d = POS[POS.pos == 'ADV']
d = d.groupby('text')['pos'].count().sort_values(ascending=False).head(20)
d = pd.DataFrame(d)
d.reset_index(inplace=True)
fig = go.Figure([go.Bar(x=d['text'], y=d['pos'], text=d['pos'], textposition='auto', textangle=0)])
fig.update_layout(title_text='Top 20 ADV in the text',)
fig.update_traces(marker_color='olive', marker_line_color='black', marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330, title='Words')
fig.update_yaxes(title='Count')
fig.show()
Performing Topic Modeling on the dataset
import re
import gensim
from gensim import corpora
# libraries for visualization
import pyLDAvis
import pyLDAvis.gensim
# Lemmatizer used for topic modeling: keeps only the lemmas of tokens whose
# POS is in *tags* (defaults to nouns and adjectives).
# Fix: the mutable-list default argument is replaced with a tuple; membership
# tests behave identically, and callers passing a list are unaffected.
def lemmatization(texts, tags=('NOUN', 'ADJ')):
    """Return, per entry of *texts* (an iterable of token lists), the lemmas
    whose part of speech is in *tags*."""
    output = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        output.append([token.lemma_ for token in doc if token.pos_ in tags])
    return output
#Tokenizing sentences (whitespace split of the cleaned, stopword-free text)
tokenized_reviews = pd.Series(data_english['processed_wo_stopwords_raw_text']).apply(lambda x: x.split())
print(tokenized_reviews[0])
#creating the term dictionary of our corpus, where every unique term is assigned an index
dictionary = corpora.Dictionary(tokenized_reviews)
# NOTE: %%time is an IPython cell magic left over from the notebook export;
# it is not valid plain-Python syntax.
%%time
doc_term_matrix = [dictionary.doc2bow(rev) for rev in tokenized_reviews]
# Creating the object for LDA model using gensim library
LDA = gensim.models.ldamodel.LdaModel
# Build LDA model (5 topics; fixed random_state for reproducibility)
lda_model = LDA(corpus=doc_term_matrix, id2word=dictionary,
num_topics=5, random_state=100, chunksize=1000,
passes=30)
lda_model.print_topics()
lda_model.save('model1.gensim')
# Interactive topic visualization.
# NOTE(review): in pyLDAvis >= 3.4 this module moved to pyLDAvis.gensim_models
# — confirm the installed version.
pyLDAvis.enable_notebook()
lda_display = pyLDAvis.gensim.prepare(lda_model, doc_term_matrix, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)
performing Sentiment Analysis on the processed_wo_stopwords_raw_text
# Sentiment classification setup: features are the cleaned, stopword-free
# English reviews; labels are the polarity-derived sentiment categories.
features=data_english['processed_wo_stopwords_raw_text']
labels=data_english['sentiment']
from sklearn.model_selection import train_test_split
# Split X and y into training and test set in 80:20 ratio (test_size=0.20)
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.20, random_state=1)
#Creating a bag-of-words dictionary of words from the data
bow_dictionary = CountVectorizer().fit(x_train)
#Total number of words in the bow_dictionary
len(bow_dictionary.vocabulary_)
#Using the bow_dictionary to create count vectors for the cleaned data.
# (NOTE: these raw counts are computed but the classifier below is trained
# on the TF-IDF features instead.)
bow = bow_dictionary.transform(x_train)
#Printing the shape of the bag of words model
print(bow.shape)
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
# tokenize and build vocab
vectorizer.fit(x_train)
# summarize
# print(vectorizer.vocabulary_)
print(vectorizer.idf_)
# encode document
train_features = vectorizer.transform(x_train)
print(train_features.shape)
# encode the held-out test documents with the same fitted vectorizer
test_features = vectorizer.transform(x_test)
from sklearn.naive_bayes import MultinomialNB
#Fitting the training data to the classifier (Multinomial NB over TF-IDF features)
classifier = MultinomialNB().fit(train_features, y_train)
#Predicting test data
predicted=classifier.predict(test_features)
# Training accuracy (accuracy on the data the model was fitted on)
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
print("The training accuracy is: ")
print(accuracy_score(y_train, classifier.predict(train_features)))
The training accuracy of the Naive Bayes model is about 74.5%.
# Test accuracy (held-out 20% split)
print("The test accuracy is: ")
print(accuracy_score(y_test,predicted))
# Per-class precision / recall / F1
print("Classification report")
print(classification_report(y_test,predicted))
# Confusion matrix: rows = true labels, columns = predicted labels
conf_matrix = confusion_matrix(y_test, predicted)
conf_matrix
#Importing indonesian Stopwords
# NOTE(review): this rebinds the module-level name STOP_WORDS (previously the
# English set imported from spacy.lang.en) to the Indonesian set — any later
# re-run of code that expects the English STOP_WORDS will silently use the
# Indonesian ones.
from spacy.lang.id import STOP_WORDS
indonesian_stopwords=STOP_WORDS
#Viewing top 15 indonesian stopwords (set iteration order is arbitrary)
mylist=list(indonesian_stopwords)
mylist[:15]
# We separated the Indonesian column earlier; preview it.
data_indonesia.head(10)

# Translation helper (duplicate of the earlier `convert`; TextBlob's
# translate() calls the Google Translate API and needs network access).
def convert(text):
    """Translate the stringified *text* to English; returns a TextBlob."""
    blob = TextBlob(str(text))
    return blob.translate(to='en')

# Bug fix: the labels say "top 5" but the slices originally took 10 rows.
print("top 5 sentences of raw_text is: \n ", data['raw_text'][:5])
print("\n top 5 sentences of review_text is:\n ", convert(data['review_text'][:5]))
#finding and dropping rows having NULL values
print('\n before removing null values')
print(data_indonesia.isnull().sum())
data_indonesia.dropna(inplace=True)
print('\n after removing null values')
print(data_indonesia.isnull().sum())
#Converting Indonesian text to English via TextBlob (Google Translate API —
#needs network access; note this translates the str() of the whole Series,
#i.e. the truncated pandas preview, at once)
english_text=convert(data_indonesia['review_text'])
#correcting text (TextBlob spelling correction; the result is not stored)
english_text.correct()
import string

def feature(data):
    """Add the same numeric text features as the English pass, computed from
    the Indonesian `review_text` column, in place; returns the frame.

    NOTE(review): `STOP_WORDS` here is the Indonesian set (rebound by the
    spacy.lang.id import above), and TextBlob polarity is English-trained —
    polarity scores on Indonesian text should be treated with caution.
    """
    data['word_count'] = data['review_text'].apply(lambda x: len(x.split()))
    data['char_count'] = data['review_text'].apply(lambda x: len(x.replace(" ", "")))
    data['word_density'] = data['word_count'] / (data['char_count'] + 1)
    # Whitespace-delimited tokens that are exactly one punctuation mark.
    data['punctuation'] = data['review_text'].apply(
        lambda x: len([t for t in x.split() if t in string.punctuation]))
    data['total_length'] = data['review_text'].apply(len)
    data['numerics'] = data['review_text'].apply(
        lambda x: len([c for c in x if c.isdigit()]))
    data['stopwords'] = data['review_text'].apply(
        lambda x: len([w for w in x.lower().split() if w in STOP_WORDS]))
    data['capitals'] = data['review_text'].apply(
        lambda x: len([w for w in x.split(" ") if w.isupper()]))
    # Robustness fix: guard against ZeroDivisionError on empty strings.
    data['caps_vs_length'] = data.apply(
        lambda row: float(row['capitals']) / float(row['total_length'])
        if row['total_length'] else 0.0, axis=1)
    data['num_exclamation_marks'] = data['review_text'].apply(lambda x: x.count('!'))
    data['num_question_marks'] = data['review_text'].apply(lambda x: x.count('?'))
    data['num_punctuation'] = data['review_text'].apply(
        lambda x: sum(x.count(w) for w in '.,;:'))
    data['num_symbols'] = data['review_text'].apply(
        lambda x: sum(x.count(w) for w in '*&#$%'))
    data['num_unique_words'] = data['review_text'].apply(lambda x: len(set(x.split())))
    data['words_vs_unique'] = data['num_unique_words'] / data['word_count']
    data["word_unique_percent"] = data["num_unique_words"] * 100 / data['word_count']
    data['polarity'] = data['review_text'].apply(
        lambda text: TextBlob(text).sentiment.polarity)
    return data
Analyzing the Indonesian-language column
# Feature extraction + distribution plots for the Indonesian column.
feature(data_indonesia)

# Fix throughout: np.histogram's `normed` argument was removed in NumPy 1.24 —
# use `density` instead (False keeps raw counts, as before).
hist=hv.Histogram(np.histogram(data_indonesia['word_count'],bins=100, density=False),label='Histogram of Word Count')
hist.opts(color='red',width=600, height=380, logy=False, tools=['hover'],xlabel=' Word Count')

hist=hv.Histogram(np.histogram(data_indonesia['char_count'],bins=100, density=False),label='Histogram of Charater Count')
hist.opts(color='green',width=600,height=380, logy=False, tools=['hover'],xlabel=' Charater Count')

hist=hv.Histogram(np.histogram(data_indonesia['word_density'],bins=100, density=False),label='Histogram of Word Density')
hist.opts(color='blue',width=600,height=380, logy=False, tools=['hover'],xlabel=' Word Density')

hist=hv.Histogram(np.histogram(data_indonesia['total_length'],bins=100, density=False),label='Histogram of Total Length')
hist.opts(color='maroon',width=600,height=380, logy=False, tools=['hover'],xlabel='Total Length')

# Numeric-character counts per sentence.
a=data_indonesia.numerics.value_counts()
bar=hv.Bars(a,label='Numeric Value Count')
bar.opts(width=600,height=380, logy=False, tools=['hover'],xlabel='Numeric Value', ylabel='Sentences Count')

hist=hv.Histogram(np.histogram(data_indonesia['capitals'],bins=100, density=False),label='Histogram of Capital Charater Count')
hist.opts(color='indigo',width=600,height=380, logy=False, tools=['hover'],xlabel=' Capital Charater Count')

# Exclamation-mark counts per sentence.
a=data_indonesia.num_exclamation_marks.value_counts()
bar=hv.Bars(a,label='Number Exclamation Marks')
bar.opts(color='orange',width=600,height=380, logy=False, tools=['hover'],xlabel='Exclamation Mark count', ylabel='Sentences Count')

hist=hv.Histogram(np.histogram(data_indonesia['num_unique_words'],bins=100, density=False),label='Histogram of Unique Words')
hist.opts(color='violet',width=600,height=380, logy=False, tools=['hover'],xlabel='Unique Words Count')

hist=hv.Histogram(np.histogram(data_indonesia['polarity'],bins=100, density=False),label='Histogram of Sentiment Polarity')
hist.opts(color='indigo',width=600,height=380, logy=False, tools=['hover'],xlabel='Sentiment Polarity')
# Sample reviews at each polarity extreme.
# Fixes: the neutral and negative cells printed the copy-pasted "highest
# positive" label, and DataFrame.sample raises if fewer rows exist than
# requested — guard with min().
print('5 random reviews with the highest positive sentiment polarity: \n')
pos = data_indonesia.loc[data_indonesia.polarity == 1, ['review_text']]
for c in pos.sample(min(5, len(pos))).values:
    print(c[0])

print('5 random reviews with neutral (zero) sentiment polarity: \n')
neu = data_indonesia.loc[data_indonesia.polarity == 0, ['review_text']]
for c in neu.sample(min(5, len(neu))).values:
    print(c[0])

print('3 random reviews with the most negative sentiment polarity: \n')
neg = data_indonesia.loc[data_indonesia.polarity == -1, ['review_text']]
for c in neg.sample(min(3, len(neg))).values:
    print(c[0])
# Remove punctuation from the raw review text (helper defined earlier in the
# file; note the original comment wrongly said "Removing StopWords").
data_indonesia['processed_raw_text']=data_indonesia['review_text'].apply(remove_puctuation)
# Lemmatize the punctuation-free text in place.
data_indonesia['processed_raw_text']=data_indonesia['processed_raw_text'].apply(lemmatize)
# Removing stopwords and creating a new column.
def remove_stop_words(text, stopwords=None):
    """Return *text* with stopwords removed.

    Parameters
    ----------
    text : str
        Whitespace-separated text to filter.
    stopwords : collection of str, optional
        Words to drop. Defaults to the module-level ``indonesian_stopwords``.

    Returns
    -------
    str
        The surviving words joined by single spaces.
    """
    # The original implementation also ran the spaCy pipeline on the text
    # (nlp(str(text))) but never used the result -- that expensive dead call
    # is removed here.
    if stopwords is None:
        stopwords = indonesian_stopwords
    kept = [word for word in text.split() if word not in stopwords]
    return " ".join(kept)
# Keep both variants: with and without stopwords.
data_indonesia['processed_wo_stopwords_raw_text'] = data_indonesia['processed_raw_text'].apply(remove_stop_words)
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
# FIX: str(Series) renders only a truncated repr (head/tail rows plus index
# and dtype text), so the cloud was built from a tiny, index-polluted sample.
# Join the full column instead.
text = " ".join(data_indonesia['processed_raw_text'].astype(str))
wordcloud = WordCloud(max_font_size=100, max_words=1000, background_color="white", width=1200, height=650, colormap="icefire").generate(text)
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
Next, we look at the distribution of the top unigrams before removing stop words.
def top_words(text):
    """Return (word, count) pairs for *text*, most frequent first.

    Parameters
    ----------
    text : iterable of str
        Documents to count, e.g. a pandas Series of processed reviews.

    Returns
    -------
    list of (str, int)
        Vocabulary terms with their corpus-wide counts, descending.
    """
    vector = CountVectorizer()
    # FIX: the original ignored the ``text`` argument and always vectorised
    # the module-level data_indonesia['processed_raw_text'] column.
    bag_of_words = vector.fit_transform(text)
    sum_of_words = bag_of_words.sum(axis=0)
    frequency = [(word, sum_of_words[0, idx]) for word, idx in vector.vocabulary_.items()]
    return sorted(frequency, key=lambda x: x[1], reverse=True)
# Bar chart of the 20 most frequent unigrams (stop words still included).
words = top_words(data_indonesia['processed_raw_text'])
df6 = pd.DataFrame(words, columns=['ReviewText', 'count']).head(20)
unigram_bar = go.Bar(x=df6['ReviewText'], y=df6['count'], text=df6['count'], textposition='auto', textangle=0)
fig = go.Figure([unigram_bar])
fig.update_layout(title_text='Top 20 words in review before removing stop words',)
fig.update_traces(marker_color='green', marker_line_color='black', marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330, title='Words')
fig.update_yaxes(title='Count')
fig.show()
def top_words_wo_stop(text):
    """Return (word, count) pairs for *text* with Indonesian stop words
    filtered by the vectorizer, most frequent first."""
    vector = CountVectorizer(stop_words=indonesian_stopwords)
    # FIX: vectorise the ``text`` argument instead of always reading the
    # module-level DataFrame column.
    bag_of_words = vector.fit_transform(text)
    sum_of_words = bag_of_words.sum(axis=0)
    frequency = [(word, sum_of_words[0, idx]) for word, idx in vector.vocabulary_.items()]
    return sorted(frequency, key=lambda x: x[1], reverse=True)
# Same unigram chart, but after stop-word removal.
words = top_words_wo_stop(data_indonesia['processed_raw_text'])
df7 = pd.DataFrame(words, columns=['ReviewText', 'count']).head(20)
clean_bar = go.Bar(x=df7['ReviewText'], y=df7['count'], text=df7['count'], textposition='auto', textangle=0)
fig = go.Figure([clean_bar])
fig.update_layout(title_text='Top 20 words in review after removing stop words',)
fig.update_traces(marker_color='blue', marker_line_color='black', marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330, title='Words')
fig.update_yaxes(title='Count')
fig.show()
def top_words_wo_stop(text):
    """Return (bigram, count) pairs for *text*, most frequent first
    (stop words are NOT removed here)."""
    vector = CountVectorizer(ngram_range=(2, 2))
    # FIX: vectorise the ``text`` argument instead of always reading the
    # module-level DataFrame column.
    bag_of_words = vector.fit_transform(text)
    sum_of_words = bag_of_words.sum(axis=0)
    frequency = [(word, sum_of_words[0, idx]) for word, idx in vector.vocabulary_.items()]
    return sorted(frequency, key=lambda x: x[1], reverse=True)
# Bar chart of the 20 most frequent bigrams (stop words still included).
words = top_words_wo_stop(data_indonesia['processed_raw_text'])
df8 = pd.DataFrame(words, columns=['ReviewText', 'count']).head(20)
bigram_bar = go.Bar(x=df8['ReviewText'], y=df8['count'], text=df8['count'], textposition='auto', textangle=0)
fig = go.Figure([bigram_bar])
fig.update_layout(title_text='Top 20 bigrams in review before removing stop words',)
fig.update_traces(marker_color='gray', marker_line_color='black', marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330, title='Words')
fig.update_yaxes(title='Count')
fig.show()
def top_words_wo_stop(text):
    """Return (bigram, count) pairs for *text* after stop-word removal,
    most frequent first."""
    vector = CountVectorizer(stop_words=indonesian_stopwords, ngram_range=(2, 2))
    # FIX: vectorise the ``text`` argument instead of always reading the
    # module-level DataFrame column.
    bag_of_words = vector.fit_transform(text)
    sum_of_words = bag_of_words.sum(axis=0)
    frequency = [(word, sum_of_words[0, idx]) for word, idx in vector.vocabulary_.items()]
    return sorted(frequency, key=lambda x: x[1], reverse=True)
# Same bigram chart, but after stop-word removal.
words = top_words_wo_stop(data_indonesia['processed_raw_text'])
df9 = pd.DataFrame(words, columns=['ReviewText', 'count']).head(20)
clean_bigram_bar = go.Bar(x=df9['ReviewText'], y=df9['count'], text=df9['count'], textposition='auto', textangle=0)
fig = go.Figure([clean_bigram_bar])
fig.update_layout(title_text='Top 20 bigrams in review after removing stop words',)
fig.update_traces(marker_color='maroon', marker_line_color='black', marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330, title='Words')
fig.update_yaxes(title='Count')
fig.show()
def top_words_wo_stop(text):
    """Return (trigram, count) pairs for *text*, most frequent first
    (stop words are NOT removed here)."""
    vector = CountVectorizer(ngram_range=(3, 3))
    # FIX: vectorise the ``text`` argument instead of always reading the
    # module-level DataFrame column.
    bag_of_words = vector.fit_transform(text)
    sum_of_words = bag_of_words.sum(axis=0)
    frequency = [(word, sum_of_words[0, idx]) for word, idx in vector.vocabulary_.items()]
    return sorted(frequency, key=lambda x: x[1], reverse=True)
# Bar chart of the 20 most frequent trigrams (stop words still included).
words = top_words_wo_stop(data_indonesia['processed_raw_text'])
df10 = pd.DataFrame(words, columns=['ReviewText', 'count']).head(20)
trigram_bar = go.Bar(x=df10['ReviewText'], y=df10['count'], text=df10['count'], textposition='auto', textangle=0)
fig = go.Figure([trigram_bar])
fig.update_layout(title_text='Top 20 trigrams in review before removing stop words',)
fig.update_traces(marker_color='orange', marker_line_color='black', marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330, title='Words')
fig.update_yaxes(title='Count')
fig.show()
def top_words_wo_stop(text):
    """Return (trigram, count) pairs for *text* after stop-word removal,
    most frequent first."""
    vector = CountVectorizer(stop_words=indonesian_stopwords, ngram_range=(3, 3))
    # FIX: vectorise the ``text`` argument instead of always reading the
    # module-level DataFrame column.
    bag_of_words = vector.fit_transform(text)
    sum_of_words = bag_of_words.sum(axis=0)
    frequency = [(word, sum_of_words[0, idx]) for word, idx in vector.vocabulary_.items()]
    return sorted(frequency, key=lambda x: x[1], reverse=True)
# Same trigram chart, but after stop-word removal.
words = top_words_wo_stop(data_indonesia['processed_raw_text'])
df10 = pd.DataFrame(words, columns=['ReviewText', 'count']).head(20)
clean_trigram_bar = go.Bar(x=df10['ReviewText'], y=df10['count'], text=df10['count'], textposition='auto', textangle=0)
fig = go.Figure([clean_trigram_bar])
fig.update_layout(title_text='Top 20 trigrams in review after removing stop words',)
fig.update_traces(marker_color='green', marker_line_color='black', marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330, title='Words')
fig.update_yaxes(title='Count')
fig.show()
# Bucket the continuous polarity score into a categorical `sentiment` column:
#   POSITIVE: 0.2 <= polarity <= 1
#   NEUTRAL :   0 <= polarity <  0.2
#   NEGATIVE:  -1 <= polarity <  0
# The three masks are disjoint, so assignment order does not matter; rows
# whose polarity is NaN match none of them and keep sentiment = NaN.
data_indonesia.loc[(data_indonesia.polarity >= 0.2) & (data_indonesia.polarity <= 1),'sentiment'] = 'positive'
data_indonesia.loc[(data_indonesia.polarity >= 0) & (data_indonesia.polarity < 0.2),'sentiment'] = 'neutral'
data_indonesia.loc[(data_indonesia.polarity >= -1) & (data_indonesia.polarity < 0), 'sentiment'] = 'negative'
# Bar chart of how many reviews landed in each sentiment bucket.
sentiment_counts = data_indonesia.groupby('sentiment')['processed_raw_text'].count()
y = pd.DataFrame(sentiment_counts).reset_index()
count_bar = go.Bar(x=y['sentiment'], y=y['processed_raw_text'], text=y['processed_raw_text'], textposition='auto', textangle=0)
fig = go.Figure([count_bar])
fig.update_layout(title_text='Total number of Sentiment in each category')
fig.update_traces(marker_color='dodgerblue', marker_line_color='black', marker_line_width=1.5, opacity=0.8)
fig.update_xaxes(tickangle=330, title='Sentiment')
fig.update_yaxes(title='Count')
fig.show()
# Removing whitespaces from the data
import re
# Segment the processed text into sentences and collapse runs of whitespace.
# NOTE(review): str(Series) yields a truncated repr, not the full column --
# presumably intended as a quick sample; confirm before relying on `sample`.
doc_indo = nlp(str(data_indonesia['processed_raw_text']))
sample = []
for sent in doc_indo.sents:
    # FIX: raw string -- "\s" is an invalid escape sequence and warns on
    # modern Python; r"\s+" is the correct regex literal.
    cleaned = re.sub(r"\s+", " ", sent.text)
    print(cleaned, "\n")
    sample.append(cleaned)
# Build a token-level POS table from the cleaned sentences.
text = []
pos = []
pos_tag = []
sentences = []
for sent in sample:
    # FIX: the original called nlp(str(sample)) -- parsing the repr of the
    # WHOLE list on every iteration, duplicating every token len(sample)
    # times. Parse only the current sentence.
    parsed_sentence = nlp(sent)
    for token in parsed_sentence:
        text.append(token.text)
        pos.append(token.pos_)
        pos_tag.append(token.tag_)
        sentences.append(token)
POS2 = pd.DataFrame({'sentence': sentences, 'text': text, 'pos': pos, 'pos_tag': pos_tag})
POS2.head()
# Top 20 most frequent NOUN tokens.
# FIX: removed a dead statement -- the original computed
# POS2.groupby(['pos','text'])['pos'].count() and immediately overwrote it.
b = POS2[POS2.pos == 'NOUN']
b = b.groupby('text')['pos'].count().sort_values(ascending=False).head(20)
b = pd.DataFrame(b)
b.reset_index(inplace=True)
fig = go.Figure([go.Bar(x=b['text'], y=b['pos'], text=b['pos'], textposition='auto', textangle=0)])
fig.update_layout(title_text='Top 20 NOUN in the text',)
fig.update_traces(marker_color='chocolate', marker_line_color='black', marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330, title='Words')
fig.update_yaxes(title='Count')
fig.show()
# Top 20 most frequent ADJ tokens.
# FIX: removed the same dead groupby statement as in the NOUN plot -- its
# result was immediately overwritten.
b = POS2[POS2.pos == 'ADJ']
b = b.groupby('text')['pos'].count().sort_values(ascending=False).head(20)
b = pd.DataFrame(b)
b.reset_index(inplace=True)
fig = go.Figure([go.Bar(x=b['text'], y=b['pos'], text=b['pos'], textposition='auto', textangle=0)])
fig.update_layout(title_text='Top 20 ADJ in the text',)
fig.update_traces(marker_color='fuchsia', marker_line_color='black', marker_line_width=1.5, opacity=0.6)
fig.update_xaxes(tickangle=330, title='Words')
fig.update_yaxes(title='Count')
fig.show()
#Creating Tokens
def lemmatization(texts, tags=('NOUN', 'ADJ', 'ADV')):
    """Lemmatize *texts*, keeping only tokens whose POS is in *tags*.

    Parameters
    ----------
    texts : iterable of iterable of str
        Tokenised documents; each document is joined with spaces before
        being run through the spaCy pipeline.
    tags : collection of str, optional
        Universal POS tags to keep (default: nouns, adjectives, adverbs).

    Returns
    -------
    list of list of str
        One list of lemmas per input document.
    """
    # FIX: immutable tuple default instead of the original mutable list
    # default argument; membership tests behave identically.
    output = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        output.append([token.lemma_ for token in doc if token.pos_ in tags])
    return output
# Tokenise each stop-word-free review into a plain word list.
tokenized_reviews2 = pd.Series(data_indonesia['processed_wo_stopwords_raw_text']).apply(lambda review: review.split())
print(tokenized_reviews2[0])
# Term dictionary of the corpus: every unique term gets an integer id, and
# each review becomes a bag-of-words list of (term-id, count) pairs.
dictionary2 = corpora.Dictionary(tokenized_reviews2)
doc_term_matrix2 = [dictionary2.doc2bow(review) for review in tokenized_reviews2]
# Fit a 5-topic LDA model on the bag-of-words corpus and visualise it.
# The module-level alias is kept in case later cells reuse it.
LDA = gensim.models.ldamodel.LdaModel
lda_model2 = LDA(
    corpus=doc_term_matrix2,
    id2word=dictionary2,
    num_topics=5,
    random_state=100,
    chunksize=1000,
    passes=15,
)
lda_model2.print_topics()
lda_model2.save('model2.gensim')
# Interactive topic visualisation inside the notebook.
pyLDAvis.enable_notebook()
lda_display = pyLDAvis.gensim.prepare(lda_model2, doc_term_matrix2, dictionary2, sort_topics=False)
pyLDAvis.display(lda_display)